Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
tmp <- fs::dir_map("R Luminex/", source)
theme_set(theme_light())library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
tmp <- fs::dir_map("R Luminex/", source)
theme_set(theme_light())VIBRANT_dropbox_dir <-
"/Users/laurasymul/Dropbox/Academia/Projects/VIBRANT Study Files/"
luminex_dir <-
str_c(VIBRANT_dropbox_dir, "15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING")plate_dirs <- list.files(luminex_dir, full.names = TRUE, pattern = "late [0-9]")The data for the first 9 plates were uploaded to the VIBRANT Dropbox by Lenine Liebenberg in February 2025 in the following directories:
plate_dirs |> str_remove(".*Projects/")[1] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/Plate 1_potential rerun"
[2] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/Plate 2"
[3] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 3"
[4] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 4"
[5] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 5"
[6] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 6"
[7] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 7"
[8] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 8"
[9] "VIBRANT Study Files/15_VIBRANT Luminex/VMRC VIBRANT LUMINEX Results_ONGOING/plate 9"
The data was made available as excel files. In these directories, we find the following files:
files <- list.files(plate_dirs, full.names = FALSE, pattern = "xlsx$")
files [1] "03022025_SFT_LUM_VIBRANT_plate 8 unoptimized.xlsx"
[2] "20250120 VMRC VIBRANT SFT-LUM PLATE 1.xlsx"
[3] "20250120_SFT_LUM_VIBRANT_plate 1 un optimized.xlsx"
[4] "20250120_SFT_LUM_VIBRANT_plate 1.xlsx"
[5] "20250122_SFT_LUM_VIBRANT_plate 2 un optimized.xlsx"
[6] "20250122_SFT_LUM_VIBRANT_plate 2.xlsx"
[7] "20250123_SFT_LUM_VIBRANT_plate 3 un optimized.xlsx"
[8] "20250123_SFT_LUM_VIBRANT_plate 3.xlsx"
[9] "20250124_SFT_LUM_VIBRANT_plate 4 un optimized.xlsx"
[10] "20250124_SFT_LUM_VIBRANT_plate 4.xlsx"
[11] "20250129_SFT_LUM_VIBRANT_plate 5_UNOPTIMIZED.xlsx"
[12] "20250129_SFT_LUM_VIBRANT_plate 5.xlsx"
[13] "20250130_SFT_LUM_VIBRANT_plate 6 unoptimized.xlsx"
[14] "20250130_SFT_LUM_VIBRANT_plate 6.xlsx"
[15] "20250131_SFT_LUM_VIBRANT_plate 7 Unoptimized.xlsx"
[16] "20250131_SFT_LUM_VIBRANT_plate 7.xlsx"
[17] "20250203_SFT_LUM_VIBRANT_plate 8.xlsx"
[18] "20250204_SFT_LUM_VIBRANT_plate 9 unoptimized.xlsx"
[19] "20250204_SFT_LUM_VIBRANT_plate 9.xlsx"
@Lenine: here, I’m assuming that I should only consider files that are of the form [date]_SFT_LUM_VIBRANT_plate [0-9]*.xlsx (i.e., not considering the “unoptimized” files). If this is not the case, please let me know :)
We load this data into R and store the excel sheets from the different plates in a single SummarizedExperiment object.
raw <-
read_luminex_excels(
dirs = plate_dirs,
pattern = "SFT_LUM_VIBRANT_plate [0-9]*\\.xlsx$",
name = "VIBRANT batch 1"
) # Reading data from
20250120_SFT_LUM_VIBRANT_plate 1.xlsx
Reading data from
20250122_SFT_LUM_VIBRANT_plate 2.xlsx
Reading data from
20250123_SFT_LUM_VIBRANT_plate 3.xlsx
Reading data from
20250124_SFT_LUM_VIBRANT_plate 4.xlsx
Reading data from
20250129_SFT_LUM_VIBRANT_plate 5.xlsx
Reading data from
20250130_SFT_LUM_VIBRANT_plate 6.xlsx
Reading data from
20250131_SFT_LUM_VIBRANT_plate 7.xlsx
Reading data from
20250203_SFT_LUM_VIBRANT_plate 8.xlsx
Reading data from
20250204_SFT_LUM_VIBRANT_plate 9.xlsx
raw# A SummarizedExperiment-tibble abstraction: 41,472 × 15
# Features=48 | Samples=864 | Assays=FI, FI_wo_background, raw_obs_conc,
# exp_conc
.feature .sample FI FI_wo_background raw_obs_conc exp_conc plate_nb
<chr> <chr> <dbl> <dbl> <chr> <dbl> <int>
1 Hu b-NGF (46) 001_A1 18215 18199 *6907,81 6266 1
2 Hu CTACK (72) 001_A1 5005 4993. 22098,91 31059 1
3 Hu Eotaxin (4… 001_A1 14461 14407. *3256,16 3121 1
4 Hu FGF basic … 001_A1 15876. 15862. <NA> 56685 1
5 Hu G-CSF (57) 001_A1 15179 15164. 98623,61 104548 1
6 Hu GM-CSF (34) 001_A1 21533 21511. *9584,69 7062 1
7 Hu GRO-a (61) 001_A1 6564 6504. <NA> 837309 1
8 Hu HGF (62) 001_A1 18560 18550 149685,67 177901 1
9 Hu IFN-a2 (20) 001_A1 14173 14162. <NA> 24795 1
10 Hu IFN-g (21) 001_A1 14023 14008 *28781,91 27687 1
# ℹ 38 more rows
# ℹ 8 more variables: plate_name <chr>, plate_row <chr>, plate_col <fct>,
# sample_type <fct>, type <chr>, description <chr>, sample_id <chr>,
# dilution <dbl>
plot_plate_design(raw) [[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
@Lenine: It looks like the sample names on plate 1 are different than the sample names on all other plates. I see that there is an additional excel file on the dropbox (Plate Layouts.xlsx) that provides both format. I assumed sample names should be harmonized across plates and used the format from plate 2+.
@Lenine: Sample on plate 4 well A5 has a weird name (LO[). On the excel file, it looks like it should be sample 6_14_23sa01 (SA_068-20-0153_V1_1000). Can you confirm?
samples <-
raw |>
as_tibble() |>
select(.sample, sample_type, sample_id) |>
distinct()@Lenine: There is also a small mismatch in the names of biological controls between different plates. I assumed that BIO CTRL is the same as BIOL CTRL. Let me know otherwise.
samples |> filter(sample_id |> str_detect("CTRL")) |> dplyr::count(sample_id)# A tibble: 3 × 2
sample_id n
<chr> <int>
1 BIO CTRL 2
2 BIOL CTRL 16
3 STD CTRL 18
raw <-
raw |>
mutate(sample_id = sample_id |> str_replace_all("BIO CTRL", "BIOL CTRL"))raw <-
raw |>
mutate(
sample_type =
case_when(
str_detect(sample_id, " CTRL") ~ "Positive control",
TRUE ~ sample_type
)
)rm(samples)We load the data contained in the first sheet (Plating) from the Plate Layouts.xlsx file. At the moment, we only consider the data in A2:X372.
manifest <-
read_xlsx(
str_c(luminex_dir, "/Plate Layouts.xlsx"),
sheet = "Plating", range = "A2:X372"
)manifest_col <- c("Sample_Vcode", "shorthand title", "alternate title", "Location", "Round 1 Final dilution factor")We subset and rename the column of interest (Sample_Vcode, shorthand title, alternate title, Location, Round 1 Final dilution factor):
manifest <-
manifest |>
select(manifest_col) |>
dplyr::rename(
sample_vcode = Sample_Vcode,
shorthand_title = `shorthand title`,
sample_id = `alternate title`,
location = Location,
dilution_manifest = `Round 1 Final dilution factor`
)
manifest |> head() |> gt(caption = "Top rows of the manifest data after column selection and renaming.")| sample_vcode | shorthand_title | sample_id | location | dilution_manifest |
|---|---|---|---|---|
| 068200008-1000 | 6_1_23sa01 | SA_068-20-0008_V1_1000 | SA | 4.8 |
| 068200008-1100 | 6_1_23sa02 | SA_068-20-0008_V2_1100 | SA | 7.2 |
| 068200008-1200 | 6_1_23sa03 | SA_068-20-0008_V3_1200 | SA | 6.9 |
| 068200008-1300 | 6_1_23sa04 | SA_068-20-0008_V4_1300 | SA | 4.8 |
| 068200008-1400 | 6_1_23sa05 | SA_068-20-0008_V5_1400 | SA | 5.5 |
| 068200008-1500 | 6_1_23sa06 | SA_068-20-0008_V6_1500 | SA | 4.9 |
We rename the plate 1 sample_id to match the actual (manifest) sample_id.
samples <-
colData(raw) |>
as.data.frame() |>
rownames_to_column(".sample") |>
as_tibble()
samples <-
samples |>
left_join(
manifest |>
select(shorthand_title, sample_id) |>
dplyr::rename(sample_id_manifest = sample_id, sample_id = shorthand_title),
by = join_by(sample_id)
)
should_be_zero <-
samples |>
filter((plate_nb != 1) & !is.na(sample_id_manifest)) |>
nrow()
if (should_be_zero > 0)
stop("There are samples with a manifest sample_id that are not on plate 1")
samples <-
samples |>
mutate(
sample_id =
case_when(
!is.na(sample_id_manifest) ~ sample_id_manifest,
TRUE ~ sample_id
)
)We also include the location and dilution_manifest columns in the colData(raw) data.
samples <-
samples |>
left_join(manifest, by = join_by(sample_id))
raw$sample_id <- samples$sample_id
raw$location <- samples$location
raw$dilution_manifest <- samples$dilution_manifest
rm(samples)We compare the dilution factors from the excel file and the manifest:
colData(raw) |>
as.data.frame() |>
rownames_to_column(".sample") |>
as_tibble() |>
ggplot() +
aes(x = dilution, y = dilution_manifest, col = plate_name) +
geom_abline(intercept = 0, slope = 1, col = "gray80") +
geom_point(alpha = 0.5, size = 0.2) +
coord_fixed() +
labs(
x = "Dilution factor from Luminex excel file\n(dilution sheet)",
y = "Dilution factor from manifest\nRound 1 Final dilution factor",
color = "Plate"
) +
facet_wrap(plate_name |> str_replace_all("_", " ") |> str_wrap(20) ~ ., nrow = 2) +
guides(col = "none")@Lenine: It looks like for most plates (except some values in plate 6 and 7?), the dilution factors have been rounded to the lowest integer. Is this correct? Should we use the actual value instead (the manifest value?). For now, I keep the values from the excel file, but let me know if we should use the manifest values instead.
We also add a pid (participant id) and a visit_nb that we extract from the sample_id:
raw$pid <-
ifelse(
str_detect(raw$sample_id, "^US|^SA"),
raw$sample_id |> str_remove("SA_068-20-|US_068-10-") |> str_remove("_V[0-9].*"),
NA
)
raw$visit_nb <-
raw$sample_id |> str_extract("V[0-9]*") |> str_remove("V") |> as.integer()# colData(raw) |>
# as.data.frame() |>
# dplyr::count(location, pid, visit_nb) |>
# filter(!is.na(pid), !is.na(visit_nb)) |>
# pivot_wider(names_from = visit_nb, values_from = n, names_prefix = "V", values_fill = 0) |>
# gt(caption = "Number of samples per location, participant, and visit.")
colData(raw) |>
as.data.frame() |>
dplyr::count(location, pid, visit_nb) |>
filter(!is.na(pid), !is.na(visit_nb)) |>
arrange(visit_nb) |>
mutate(
visit = str_c("V", visit_nb) |> fct_inorder(),
n = n |> factor()
) |>
ggplot(aes(x = visit, y = pid, fill = n)) +
geom_tile() +
facet_wrap(. ~ location, scales = "free") +
scale_fill_manual(name = "Number of samples", values = c("steelblue1", "steelblue3")) +
ylab("Participant IDs")raw |> plot_standard_curves() raw |> filter(.feature %in% "Hu IL-7 (74)") |>
plot_standard_curves() + facet_wrap(. ~ plate_name + .feature) + guides(col = "none") +
ggtitle("Standard curves for Hu IL-7 (74)")raw |> plot_dilution_factors()We exclude the cytokines that do not have any values across all plates
raw <- exclude_missing_analytes(raw)VIBRANT batch 1
0 excluded analytes:
raw <- impute_OOR(raw)VIBRANT batch 1
Added 2 assays
-`conc` with the imputed concentrations
-`value_type` with the type of value (observed, extrapolated, below LLOQ, above ULOQ)
and added @rowData with ULOQ, LLOQ, and median values for each analyte, as well as the mean concentrations across manufacturer's control replicates.
plot_conc_by_analyte(raw, color_by = "value_type")plot_conc_by_analyte(raw, color_by = "sample_type")plot_conc_by_analyte(raw, color_by = "plate_name")We next explore which transformation (no transformation, log, asinh, or square root) of the data is most suitable to stabilize the variance of the data.
check_transformation(raw, transf = "none") +
check_transformation(raw, transf = "asinh") +
check_transformation(raw, transf = "log") +
check_transformation(raw, transf = "sqrt")As expected, the log transformation appears to be the most suitable for stabilizing the variance of the data.
assay(raw, "conc_log2") <- raw |> assay("conc") |> log2()plot_marginals <- function(raw, by = "plate_nb", binwidth = 1){
tmp <- raw |> as_tibble()
tmp <- tmp |> bind_rows(tmp |> mutate(.feature = "all analytes"))
tmp$by <- str_c(by ," ", tmp[[by]])
min_c <- min(tmp$conc_log2[!is.infinite(tmp$conc_log2)], na.rm = TRUE)
max_c <- max(tmp$conc_log2[!is.infinite(tmp$conc_log2)], na.rm = TRUE)
tmp |>
mutate(
conc_log2_bin =
conc_log2 |>
cut(breaks = seq(min_c, max_c, by = binwidth))
) |>
dplyr::count(by, .feature, conc_log2_bin) |>
group_by(by, .feature) |>
mutate(f = n/sum(n)) |>
ggplot( ) +
aes(x = conc_log2_bin, y = by, fill = by, alpha = f) +
geom_tile() +
facet_wrap(.feature ~ ., scale = "free") +
labs(
x = "log2(concentration)\n(independent scales per analyte)",
y = by
) +
scale_x_discrete(breaks = NULL) +
guides(fill = "none") +
theme(strip.text.y = element_text(angle = 0))
}Per plate
plot_marginals(raw, by = "plate_nb")Per location
plot_marginals(raw, by = "location")Per visit
plot_marginals(raw, by = "visit_nb")Comparison of standards and controls concentrations across plates (x-axis) and analytes (facets).
raw |>
filter(sample_type %in% c("Standard", "Positive control", "Blank", "Manuf. control")) |>
ggplot() +
aes(x = plate_nb |> factor(), y = conc_log2, col = sample_id, shape = sample_type) +
geom_point(size = 1, alpha = 0.5) +
facet_wrap(.feature ~ ., nrow = 3) +
theme(legend.position = "bottom") +
xlab("Plate nb") +
ylab("log2(concentration)")Comparison of biological controls (colored dots) across plates (x-axis) and analytes (facets) (constrasted with the distribution of samples in light gray):
raw |>
filter((sample_id == "BIOL CTRL") | (sample_type == "Sample")) |>
mutate(plate_ctrl = ifelse(sample_id == "BIOL CTRL", plate_nb, NA) |> factor()) |>
ggplot() +
aes(x = plate_nb |> factor(), y = conc_log2, col = plate_ctrl, alpha = is.na(plate_ctrl), shape = (value_type == "Observed concentration")) +
geom_point(size = 1) +
facet_wrap(. ~ .feature |> str_remove("Hu ") |> str_remove("\\([0-9]*\\)"), nrow = 3) + # , scales = "free"
scale_x_discrete(breaks = NULL) +
scale_alpha_manual(values = c(1, 0.1)) +
xlab("Plate nb") +
ylab("log2(concentration)") +
scale_color_discrete("Plate nb") # + theme(strip.text.x = element_text(angle = 90, hjust = 0)) complete_samples <-
raw |>
as_tibble() |>
group_by(.sample) |>
summarize(ok = !any(is.na(conc_log2))) |>
mutate(i = row_number()) |>
filter(ok)
complete <- raw[, complete_samples$.sample]pca_res <-
prcomp(complete |> assay("conc_log2") |> t(), center = TRUE, scale = TRUE)factoextra::fviz_eig(pca_res, ncp = 48)As often with Luminex data, the first component explains most of the variance.
Excluding the 1st component, we can then see that the variance decreases rapidly.
factoextra::fviz_eig(pca_res, ncp = 48) +
geom_hline(yintercept = 100*1/48, linetype = 2) +
ylim(c(0, 20))plot_pca_scores <- function(pca_res, raw, axes = 1:2, col_by = "sample_type"){
scores <-
colData(raw) |>
as.data.frame() |>
rownames_to_column(".sample") |>
as_tibble() |>
left_join(
as_tibble(pca_res$x) |>
mutate(.sample = rownames(pca_res$x)),
by = join_by(.sample)
)
var <- factoextra::get_eig(pca_res)
xvar <- str_c("PC", axes[1])
yvar <- str_c("PC", axes[2])
ggplot(scores) +
aes(x = .data[[xvar]], y = .data[[yvar]], col = .data[[col_by]]) +
geom_vline(xintercept = 0, col = "gray") +
geom_hline(yintercept = 0, col = "gray") +
geom_point(alpha = 0.8) +
labs(
x = str_c(xvar, " (", var$variance.percent[axes[1]] |> round(1), "%)"),
y = str_c(yvar, " (", var$variance.percent[axes[2]] |> round(1), "%)")
) +
coord_fixed()
}Checking the scores and biplots by sample type:
g_12 <- plot_pca_scores(pca_res, complete, axes = 1:2, col_by = "sample_type")
g_23 <- plot_pca_scores(pca_res, complete, axes = 2:3, col_by = "sample_type")
g_45 <- plot_pca_scores(pca_res, complete, axes = 4:5, col_by = "sample_type")
g_12 / (g_23 | g_45) +
plot_layout(guides = "collect", heights = c(1, 1.5))Focus on controls and standards:
complete <-
complete |>
mutate(control_id = ifelse(sample_type == "Sample", NA, sample_id))
g_12 <- plot_pca_scores(pca_res, complete, axes = 1:2, col_by = "control_id")
g_23 <- plot_pca_scores(pca_res, complete, axes = 2:3, col_by = "control_id")
g_45 <- plot_pca_scores(pca_res, complete, axes = 4:5, col_by = "control_id")
g_12 / (g_23 | g_45) +
plot_layout(guides = "collect", heights = c(1, 1.5))Focus on the biological controls across plates:
complete <-
complete |>
mutate(plate_biol_ctrl = ifelse(sample_id == "BIOL CTRL", plate_name, NA))
g_12 <-
plot_pca_scores(pca_res, complete, axes = 1:2, col_by = "plate_biol_ctrl") +
scale_color_discrete(na.value = "gray90")
g_23 <-
plot_pca_scores(pca_res, complete, axes = 2:3, col_by = "plate_biol_ctrl") +
scale_color_discrete(na.value = "gray90")
g_45 <-
plot_pca_scores(pca_res, complete, axes = 4:5, col_by = "plate_biol_ctrl") +
scale_color_discrete(na.value = "gray90")
g_12 / (g_23 | g_45) +
plot_layout(guides = "collect", heights = c(1, 1.5))Plate effects?
g_12 <- plot_pca_scores(pca_res, complete, axes = 1:2, col_by = "plate_name")
g_23 <- plot_pca_scores(pca_res, complete, axes = 2:3, col_by = "plate_name")
g_45 <- plot_pca_scores(pca_res, complete, axes = 4:5, col_by = "plate_name")
g_12 / (g_23 | g_45) +
plot_layout(guides = "collect", heights = c(1, 1.5))Plate effects when only considering samples (no controls or standards):
g_12 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 1:2, col_by = "plate_name") +
stat_ellipse()
g_23 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 2:3, col_by = "plate_name") +
stat_ellipse()
g_45 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 4:5, col_by = "plate_name") +
stat_ellipse()
g_12 / (g_23 | g_45) +
plot_layout(guides = "collect", heights = c(1, 1.5))Site differences?
g_12 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 1:2, col_by = "location") +
stat_ellipse()
g_23 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 2:3, col_by = "location") +
stat_ellipse()
g_45 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 4:5, col_by = "location") +
stat_ellipse()
g_12 / (g_23 | g_45) +
plot_layout(guides = "collect", heights = c(1, 1.5))Visit differences?
complete <- complete |> mutate(visit = str_c("V", visit_nb))
visit_colors <-
c(
"V1" = "red", "V2" = "green3",
"V3" = "steelblue1", "V4" = "steelblue2", "V5" = "steelblue3", "V6" = "steelblue4"
)g_12 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 1:2, col_by = "visit") +
stat_ellipse() +
scale_color_manual(values = visit_colors)
g_23 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 2:3, col_by = "visit") +
stat_ellipse() +
scale_color_manual(values = visit_colors)
g_45 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 4:5, col_by = "visit") +
stat_ellipse() +
scale_color_manual(values = visit_colors)
g_12 / (g_23 | g_45) +
plot_layout(guides = "collect", heights = c(1, 1.5))Effect of the dilution factor?
g_12 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 1:2, col_by = "dilution") +
scale_color_gradient(low = "gray80", high = "dodgerblue")
g_23 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 2:3, col_by = "dilution") +
scale_color_gradient(low = "gray80", high = "dodgerblue")
g_45 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 4:5, col_by = "dilution") +
scale_color_gradient(low = "gray80", high = "dodgerblue")
g_12 / (g_23 | g_45) +
plot_layout(guides = "collect", heights = c(1, 1.5))g_12 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 1:2, col_by = "dilution_manifest") +
scale_color_gradient(low = "gray80", high = "dodgerblue")
g_23 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 2:3, col_by = "dilution_manifest") +
scale_color_gradient(low = "gray80", high = "dodgerblue")
g_45 <-
plot_pca_scores(pca_res, complete |> filter(sample_type == "Sample"), axes = 4:5, col_by = "dilution_manifest") +
scale_color_gradient(low = "gray80", high = "dodgerblue")
g_12 / (g_23 | g_45) +
plot_layout(guides = "collect", heights = c(1, 1.5))scores_long <-
colData(raw) |>
as.data.frame() |>
rownames_to_column(".sample") |>
as_tibble() |>
inner_join(
as_tibble(pca_res$x) |>
mutate(.sample = rownames(pca_res$x)) |>
pivot_longer(cols = -.sample, names_to = "PC", values_to = "value", names_prefix = "PC") |>
mutate(PC = PC |> as.integer()),
by = join_by(.sample)
)scores_long |>
filter(sample_type == "Sample", PC <= 6) |>
ggplot(aes(x = dilution_manifest, y = value)) +
geom_point(alpha = 0.5, size = 0.5) +
geom_smooth(method = "lm", formula = "y ~ x") +
facet_wrap(PC ~ ., scales = "free", labeller = label_both) +
ylab("PC score") +
xlab("Dilution factor from manifest")It looks like the correction for the dilution is a little too aggressive (samples with high dilution factors have higher concentrations on average).
tmp <-
raw |>
as_tibble() |>
group_by(sample_id, .feature) |>
mutate(n = n()) |>
ungroup() |>
filter(n > 1, !is.na(conc_log2))
tmp <-
left_join(
tmp |>
select(.feature, .sample, sample_id, sample_type, plate_nb, conc_log2) |>
dplyr::rename(.sample_r1 = .sample, plate_nb_r1 = plate_nb, conc_log2_r1 = conc_log2),
tmp |>
select(.feature, .sample, sample_id, plate_nb, conc_log2) |>
dplyr::rename(.sample_r2 = .sample, plate_nb_r2 = plate_nb, conc_log2_r2 = conc_log2),
by = join_by(.feature, sample_id),
relationship = "many-to-many"
) |>
filter(.sample_r1 != .sample_r2)tmp <-
tmp |>
mutate(
diff = conc_log2_r1 - conc_log2_r2,
abs_diff = abs(diff),
mean = (conc_log2_r1 + conc_log2_r2)/2,
sd = sqrt(((conc_log2_r1 - mean)^2 + (conc_log2_r2 - mean)^2)),
cv = sd/mean,
replicate_type = ifelse(plate_nb_r1 == plate_nb_r2, "within plate", "across plate")
)tmp |>
ggplot() +
aes(x = diff, col = replicate_type) +
geom_density(bw = 0.05) +
facet_wrap(.feature ~ ., scales = "free") +
scale_x_continuous(limits = c(-2.5, 2.5))tmp |>
filter(!is.infinite(diff)) |>
group_by(sample_type, plate_nb_r1, plate_nb_r2, replicate_type) |>
summarize(
mean_abs_diff = mean(abs_diff),
median_abs_diff = median(abs_diff),
.groups = "drop"
) |>
ggplot() +
aes(
x = plate_nb_r1 |> factor(),
y = median_abs_diff,
col = plate_nb_r2 |> factor(),
shape = replicate_type
) +
geom_point() +
facet_grid(sample_type ~ .) +
xlab("Plate nb") +
ylab("Median absolute difference between replicates") +
scale_color_discrete("Replicate\nplate nb") +
scale_shape_discrete("Replicate type")tmp |>
# filter(sample_type == "Sample") |>
ggplot() +
aes(x = plate_nb_r2 |> factor(), y = abs_diff, col = replicate_type) +
geom_boxplot(outlier.shape = NA) +
ggbeeswarm::geom_quasirandom(size = 0.5, alpha = 0.1) +
facet_grid(sample_type ~ plate_nb_r1, scales = "free", space = "free")